import os
import os.path
import random
import math
import shutil
import ma.fs.dfs.dfsflags as dfsflags
import ma.const

# Root directory of the Gutenberg corpus, one sub-directory per category.
main_gtbg_dir = "/state/partition1/gutenberg/"
# Destination root; each job's output goes under "<main_dest_dir><job_id>/".
main_dest_dir = "/state/partition1/datasets/"
map_chunk_size = 32 * 1024 * 1024                       # BEWARE: NOT ONE FILE INPUT SHOULD BE GREATER THAN THIS SIZE
job_id = 6                # job identifier embedded in generated file names
struct_id = 0             # running per-category structure counter
max_file_size = 0         # largest single input file seen (updated by dir_all_file_sizes)
replication_scaleup = 1   # how many times each category's file list is repeated
data_chunk_prcnt = 10     # keep 1/data_chunk_prcnt of each file (one random chunk)
verbose_mode = 'n'        # 'y' enables per-file progress output


def dir_all_file_sizes(dir, files):
    """Return the combined size in bytes of *files* located in directory *dir*.

    Side effect: updates the module-level ``max_file_size`` with the size of
    the largest single file encountered so far.
    """
    global max_file_size
    total_size = 0
    for name in files:
        # os.path.join instead of manual "dir + os.sep + name" concatenation
        size = os.path.getsize(os.path.join(dir, name))
        if size > max_file_size:
            max_file_size = size
        total_size += size
    return total_size

        
# All category sub-directories of the Gutenberg root.
all_catg = os.listdir(main_gtbg_dir)
mfs = ""        # accumulated XML map stubs for every map created
map_id = -1     # global map counter across all categories
file_no = 0     # global output-file counter (embedded in output file names)

# Map-flags XML template; NOTE(review): this reaches through Python name
# mangling to a private attribute — a public accessor on DfsFlags would be safer.
xml_str = dfsflags.DfsFlags._DfsFlags__mapflag_template

# iterate through all category directories, packing random file chunks into
# output files of at most map_chunk_size bytes each
for catg in all_catg:
    catg_dir = main_gtbg_dir + catg
    files = os.listdir(str(catg_dir))
    dir_size = dir_all_file_sizes(catg_dir, files)

    print("Processing directory:", catg_dir, " - Dir size:", dir_size)

    # repeat the file list to scale the dataset up
    files = files * replication_scaleup
    print("Scaled by", replication_scaleup, "to get a scaled size of:", (dir_size * replication_scaleup))
    print("Chunked down by", data_chunk_prcnt, "to get a final structure size of:", (dir_size * replication_scaleup / data_chunk_prcnt))

    map_size = 0      # bytes accumulated in the current output file
    catg_file = 1     # number of output files produced for this category
    out_filename = ma.const.JobsXmlData.get_str_data(ma.const.xml_map_input_filename, job_id, file_no)
    dest_job_dir = main_dest_dir + str(job_id)
    out_filepath = os.path.join(dest_job_dir, out_filename)

    # ensure the destination directory exists; exist_ok avoids the race-prone
    # exists()/makedirs() pair
    os.makedirs(dest_job_dir, exist_ok=True)

    inputs = [(out_filename, 0, 0)]
    # binary mode throughout: chunks start at arbitrary byte offsets, so
    # text-mode decoding could fail mid-character
    fd = open(out_filepath, "wb+")
    print("Opening new file for struct")

    for filename in files:
        filepath = os.path.join(catg_dir, filename)
        # keep 1/data_chunk_prcnt of the file; floor division keeps size an
        # int — read()/seek() require integer arguments
        size = os.path.getsize(filepath) // data_chunk_prcnt
        map_size += size

        if verbose_mode == 'y':
            print(filepath, "Map size", map_size)

        # read one randomly chosen chunk out of the data_chunk_prcnt slots
        with open(filepath, "rb") as fr:
            fr.seek(int(random.random() * data_chunk_prcnt) * size)
            file_str = fr.read(size)

        if map_size > map_chunk_size:
            # current output file is full: emit its stub and roll over
            fd.close()

            if verbose_mode == 'y':
                print("Opening new file for same struct")
            map_id += 1
            mfs += dfsflags.DfsFlags.create_map_flags_stub(map_id, inputs, struct_id)
            map_size = size

            file_no += 1
            catg_file += 1
            out_filename = ma.const.JobsXmlData.get_str_data(ma.const.xml_map_input_filename, job_id, file_no)
            out_filepath = os.path.join(dest_job_dir, out_filename)
            inputs = [(out_filename, 0, 0)]
            fd = open(out_filepath, "wb+")
            fd.write(file_str)
        else:
            if verbose_mode == 'y':
                print("Appending file to same struct")
            fd.write(file_str)

    # stub for the last (possibly partially filled) output file of the category
    if len(inputs) > 0:
        map_id += 1
        mfs += dfsflags.DfsFlags.create_map_flags_stub(map_id, inputs, struct_id)

    fd.close()
    file_no += 1

    print(catg_file, " maps created for struct", struct_id, "'", catg, "'")

    # structure progress
    struct_id += 1

# substitute the accumulated map stubs into the XML template
xml_str = xml_str % mfs

# write the final map-flags XML to its destination file; the context manager
# guarantees the handle is closed even if the write raises
mf_filename = ma.const.XmlData.get_str_data(ma.const.xml_map_flags_filename)
with open(mf_filename, 'w+') as mf_fd:
    mf_fd.write(xml_str)

print("Max file size", max_file_size)
print("No. of maps", file_no)
print("ENDED!")
